# Kalmia analyze fruit size at end of season
# Using fruit sizes calculated from image segmentation in Python with opencv
# Callin Switzer
# 20 October 2016
# 10 Feb 2017 Update: Conducted Statistical Modeling with LMER and GLMER
Read in data
ipak <- function(pkg){
new.pkg <- pkg[!(pkg %in% installed.packages()[, "Package"])]
if(length(new.pkg)) install.packages(new.pkg, dependencies = TRUE)
sapply(pkg, require, character.only = TRUE)
}
packages <- c("ggplot2", 'lme4', 'plyr', 'influence.ME', 'sjPlot', 'multcomp', 'lsmeans', 'Matrix')
ipak(packages)
## ggplot2 lme4 plyr influence.ME sjPlot
## TRUE TRUE TRUE TRUE TRUE
## multcomp lsmeans Matrix
## TRUE TRUE TRUE
theme_set(theme_classic())
kfrt <- read.csv("kalmiaFruitFinal.csv", stringsAsFactors = FALSE)
nrow(kfrt[kfrt$dia_mm == 20.0,]) # number of images taken
## [1] 92
# clean and process data
kfrt <- kfrt[kfrt$dia_mm != 20.0, ] # 20 is the reference object in segmented images
nrow(kfrt) # number of total fruits measured
## [1] 1305
# label treatmens and access numbers
kfrt$trt <- sapply(X = 1:nrow(kfrt), FUN = function(x) strsplit(kfrt$plantNum[x], split = "__")[[1]][2])
kfrt$accessNum <- sapply(X = 1:nrow(kfrt), FUN = function(x) strsplit(kfrt$plantNum[x], split = "__")[[1]][1])
kfrt$trt <- mapvalues(kfrt$trt, c(1,2,3,4,5), c("Bagged", "Bagged & Selfed", "Unbagged",
"Unbagged & Outcrossed", "Unbagged_2"))
Add plants with 0 count to the dataset
# count number of fruits
counts = as.data.frame(table(kfrt$plantNum))
counts$trt = sapply(X = 1:nrow(counts), FUN = function(x) strsplit(as.character(counts$Var1[x]), split = "__")[[1]][2])
counts$trt <- mapvalues(counts$trt, c(1,2,3,4,5), c("Bagged", "Bagged & Selfed", "Unbagged",
"Unbagged & Outcrossed", "Unbagged_2"))
counts$accessNum = sapply(X = 1:nrow(counts), FUN = function(x) strsplit(as.character(counts$Var1[x]), split = "__")[[1]][1])
# add in the trts and accession numbers that had counts of 0
# read in error-checked datasheet
kalNotes <- read.csv("KalmiaDailyDatasheets_ErrorChecked.csv")
# get the accession numbers I should have
accNumsHave <- unique(kalNotes$Plant.Number)
accNumsHave <- accNumsHave[!(accNumsHave %in% c("", "Plant Number"))]
#change formatting to match above
accNumsHave <- gsub("#", "", accNumsHave)
accNumsHave <- gsub("\ ", "_", accNumsHave)
accNumsHave <- gsub("\\-", "_", accNumsHave)
accNumsHave <- gsub("\\*", "_", accNumsHave)
accNumsHave <-toupper(accNumsHave)
shouldHave <- as.data.frame(t(sapply(accNumsHave, function(x) as.character(paste(x, c(1:4), sep = "__")))))
row.names(shouldHave) <- NULL
shouldHave1 <- c(as.character(shouldHave[, 1]),
as.character(shouldHave[, 2]),
as.character(shouldHave[, 3]),
as.character(shouldHave[, 4]))
# find missing ones -- this is the ones that
# are in the daily datasheet that aren't in the fruit measurements
missingTrts <- setdiff(shouldHave1,unique(kfrt$plantNum[kfrt$trt != "5"]))
# this should be 0 -- the ones from the fruit measurements that aren't in the daily datasheet
setdiff(unique(kfrt$plantNum[kfrt$trt != "Unbagged_2"]), shouldHave1)
## character(0)
# here's the list of plants that had their bags/tags go missing during the experiment (meaning that
# I was unable to collect fruits)
# note: "677_66_MASS__1" was the only plant that was missing a tag during the final collection
# that wasn't missing in one of my previous checks.
NAList <- c("1129_74_E__4", "1129_74_C__4", "39_60_A__3", "685_70_A__2", "677_66_MASS__1")
# note: this ignores trt#5 which is the same as #3
ZeroFruitPlants <- missingTrts[!(missingTrts %in% NAList)]
zfdf <- data.frame(Var1 = ZeroFruitPlants,
Freq = 0,
trt = sapply(X = 1:length(ZeroFruitPlants),
FUN = function(x) strsplit(as.character(ZeroFruitPlants[x]),
split = "__")[[1]][2]),
accessNum = sapply(X = 1:length(ZeroFruitPlants),
FUN = function(x) strsplit(as.character(ZeroFruitPlants[x]),
split = "__")[[1]][1])
)
zfdf$trt <- mapvalues(zfdf$trt, c(1,2,3,4,5), c("Bagged", "Bagged & Selfed", "Unbagged",
"Unbagged & Outcrossed", "Unbagged_2"))
## The following `from` values were not present in `x`: 3, 4, 5
countFin <- rbind(counts, zfdf)
# final fruit counts for the fruits collected at the end of the experiment
countFin <- countFin[order(countFin$accessNum, countFin$trt, decreasing = FALSE), ]
# change labels from unbagged to conntrol
countFin$trt <- mapvalues(countFin$trt, c("Bagged", "Bagged & Selfed", "Unbagged",
"Unbagged & Outcrossed", "Unbagged_2")
, c("Bagged", "Bagged & Selfed", "Control",
"Unbagged & Outcrossed", "Control_2"))
# change levels, so that control is first
countFin$trt <- factor(countFin$trt, levels = c("Control","Control_2", "Bagged", "Bagged & Selfed",
"Unbagged & Outcrossed"))
Visualize counts of fruits
ggplot(countFin, aes(x = trt , y = Freq, fill = trt)) +
geom_boxplot() +
# geom_violin()+
labs(y = "Number of fruits", x = "Treatment") +
scale_fill_brewer(name = "Treatment", palette = "Set1")

saveDir <- "/Users/callinswitzer/Dropbox/ExperSummer2016/Kalmia/Manuscript/Media/"
ggsave(paste0(saveDir, "KalmiaFruitNumber.pdf"), width = 10, height = 8)
Visualize counts of fruits (ignore treatment #5)
ggplot(countFin[!(countFin$trt %in% 'Control_2'), ], aes(x = trt , y = Freq, fill = trt)) +
geom_boxplot() +
labs(y = "Number of fruits", x = "Treatment") +
theme(axis.text.x = element_text(angle = 35, hjust = 1),
legend.position = 'none') +
scale_fill_brewer(name = "Treatment", palette = "Set1")

ggsave(paste0(saveDir, "KalmiaFruitNumber_trt1_4Only.pdf"), width = 5, height = 4)
Use GLMM’s to find differences in number of fruits
First, summarize the dataset that I will be using
# I'm ignoring control#2, because it was originally intended for another experiment (analysis wasn't planned)
cf1 <- countFin[countFin$trt != "Control_2",]
nrow(countFin) # number of total counts, including Control2
## [1] 103
sum(cf1$Freq) # total number of fruits in the analysis of count
## [1] 1035
# Number of counts, excluding control_2
# this is different from the number of photos
# because I didn't take photos on 0-count plants
nrow(cf1)
## [1] 83
data.frame(table(cf1$trt)) # number of plants in each treatment that we are analyzing
## Var1 Freq
## 1 Control 21
## 2 Control_2 0
## 3 Bagged 21
## 4 Bagged & Selfed 21
## 5 Unbagged & Outcrossed 20
data.frame(table(cf1$accessNum)) # shows number of counts per plant -- 4 means that we counted all four treatments
## Var1 Freq
## 1 1129_74_A 4
## 2 1129_74_B 4
## 3 1129_74_C 3
## 4 1129_74_E 3
## 5 1129_74_F 4
## 6 12_2006_A 4
## 7 128_96_MASS 4
## 8 132_98_MASS 4
## 9 150_58_A 4
## 10 232_46_A 4
## 11 35_40_C 4
## 12 39_60_A 3
## 13 425_74_D 4
## 14 440_66_A 4
## 15 46_18_A 4
## 16 572_64_MASS 4
## 17 624_64_B 4
## 18 673_69_B 4
## 19 677_66_MASS 3
## 20 685_70_A 3
## 21 721_79_B 4
## 22 UNLABELED_1 4
# shows which treatments / plants are missing from analysis
# this is different from the plants that just had 0 counts
data.frame(table(interaction(cf1$accessNum , cf1$trt)))
## Var1 Freq
## 1 1129_74_A.Control 1
## 2 1129_74_B.Control 1
## 3 1129_74_C.Control 1
## 4 1129_74_E.Control 1
## 5 1129_74_F.Control 1
## 6 12_2006_A.Control 1
## 7 128_96_MASS.Control 1
## 8 132_98_MASS.Control 1
## 9 150_58_A.Control 1
## 10 232_46_A.Control 1
## 11 35_40_C.Control 1
## 12 39_60_A.Control 0
## 13 425_74_D.Control 1
## 14 440_66_A.Control 1
## 15 46_18_A.Control 1
## 16 572_64_MASS.Control 1
## 17 624_64_B.Control 1
## 18 673_69_B.Control 1
## 19 677_66_MASS.Control 1
## 20 685_70_A.Control 1
## 21 721_79_B.Control 1
## 22 UNLABELED_1.Control 1
## 23 1129_74_A.Control_2 0
## 24 1129_74_B.Control_2 0
## 25 1129_74_C.Control_2 0
## 26 1129_74_E.Control_2 0
## 27 1129_74_F.Control_2 0
## 28 12_2006_A.Control_2 0
## 29 128_96_MASS.Control_2 0
## 30 132_98_MASS.Control_2 0
## 31 150_58_A.Control_2 0
## 32 232_46_A.Control_2 0
## 33 35_40_C.Control_2 0
## 34 39_60_A.Control_2 0
## 35 425_74_D.Control_2 0
## 36 440_66_A.Control_2 0
## 37 46_18_A.Control_2 0
## 38 572_64_MASS.Control_2 0
## 39 624_64_B.Control_2 0
## 40 673_69_B.Control_2 0
## 41 677_66_MASS.Control_2 0
## 42 685_70_A.Control_2 0
## 43 721_79_B.Control_2 0
## 44 UNLABELED_1.Control_2 0
## 45 1129_74_A.Bagged 1
## 46 1129_74_B.Bagged 1
## 47 1129_74_C.Bagged 1
## 48 1129_74_E.Bagged 1
## 49 1129_74_F.Bagged 1
## 50 12_2006_A.Bagged 1
## 51 128_96_MASS.Bagged 1
## 52 132_98_MASS.Bagged 1
## 53 150_58_A.Bagged 1
## 54 232_46_A.Bagged 1
## 55 35_40_C.Bagged 1
## 56 39_60_A.Bagged 1
## 57 425_74_D.Bagged 1
## 58 440_66_A.Bagged 1
## 59 46_18_A.Bagged 1
## 60 572_64_MASS.Bagged 1
## 61 624_64_B.Bagged 1
## 62 673_69_B.Bagged 1
## 63 677_66_MASS.Bagged 0
## 64 685_70_A.Bagged 1
## 65 721_79_B.Bagged 1
## 66 UNLABELED_1.Bagged 1
## 67 1129_74_A.Bagged & Selfed 1
## 68 1129_74_B.Bagged & Selfed 1
## 69 1129_74_C.Bagged & Selfed 1
## 70 1129_74_E.Bagged & Selfed 1
## 71 1129_74_F.Bagged & Selfed 1
## 72 12_2006_A.Bagged & Selfed 1
## 73 128_96_MASS.Bagged & Selfed 1
## 74 132_98_MASS.Bagged & Selfed 1
## 75 150_58_A.Bagged & Selfed 1
## 76 232_46_A.Bagged & Selfed 1
## 77 35_40_C.Bagged & Selfed 1
## 78 39_60_A.Bagged & Selfed 1
## 79 425_74_D.Bagged & Selfed 1
## 80 440_66_A.Bagged & Selfed 1
## 81 46_18_A.Bagged & Selfed 1
## 82 572_64_MASS.Bagged & Selfed 1
## 83 624_64_B.Bagged & Selfed 1
## 84 673_69_B.Bagged & Selfed 1
## 85 677_66_MASS.Bagged & Selfed 1
## 86 685_70_A.Bagged & Selfed 0
## 87 721_79_B.Bagged & Selfed 1
## 88 UNLABELED_1.Bagged & Selfed 1
## 89 1129_74_A.Unbagged & Outcrossed 1
## 90 1129_74_B.Unbagged & Outcrossed 1
## 91 1129_74_C.Unbagged & Outcrossed 0
## 92 1129_74_E.Unbagged & Outcrossed 0
## 93 1129_74_F.Unbagged & Outcrossed 1
## 94 12_2006_A.Unbagged & Outcrossed 1
## 95 128_96_MASS.Unbagged & Outcrossed 1
## 96 132_98_MASS.Unbagged & Outcrossed 1
## 97 150_58_A.Unbagged & Outcrossed 1
## 98 232_46_A.Unbagged & Outcrossed 1
## 99 35_40_C.Unbagged & Outcrossed 1
## 100 39_60_A.Unbagged & Outcrossed 1
## 101 425_74_D.Unbagged & Outcrossed 1
## 102 440_66_A.Unbagged & Outcrossed 1
## 103 46_18_A.Unbagged & Outcrossed 1
## 104 572_64_MASS.Unbagged & Outcrossed 1
## 105 624_64_B.Unbagged & Outcrossed 1
## 106 673_69_B.Unbagged & Outcrossed 1
## 107 677_66_MASS.Unbagged & Outcrossed 1
## 108 685_70_A.Unbagged & Outcrossed 1
## 109 721_79_B.Unbagged & Outcrossed 1
## 110 UNLABELED_1.Unbagged & Outcrossed 1
Create a model
m1 <- glmer(Freq ~ trt + (1|accessNum), family = poisson, data = cf1)
summary(m1)
## Generalized linear mixed model fit by maximum likelihood (Laplace
## Approximation) [glmerMod]
## Family: poisson ( log )
## Formula: Freq ~ trt + (1 | accessNum)
## Data: cf1
##
## AIC BIC logLik deviance df.resid
## 699.1 711.2 -344.5 689.1 78
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -4.8567 -1.2750 -0.3495 1.1718 5.2621
##
## Random effects:
## Groups Name Variance Std.Dev.
## accessNum (Intercept) 0.4623 0.6799
## Number of obs: 83, groups: accessNum, 22
##
## Fixed effects:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 2.18834 0.16080 13.609 <2e-16 ***
## trtBagged -0.90399 0.12302 -7.348 2e-13 ***
## trtBagged & Selfed 0.18797 0.08836 2.127 0.0334 *
## trtUnbagged & Outcrossed 0.74707 0.08352 8.945 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr) trtBgg trtB&S
## trtBagged -0.219
## trtBggd&Slf -0.301 0.395
## trtUnbggd&O -0.334 0.418 0.583
# calculate LRT for trt
m2 <- update(m1, .~. - trt)
anova(m1, m2) # highly significant
## Data: cf1
## Models:
## m2: Freq ~ (1 | accessNum)
## m1: Freq ~ trt + (1 | accessNum)
## Df AIC BIC logLik deviance Chisq Chi Df Pr(>Chisq)
## m2 2 961.54 966.38 -478.77 957.54
## m1 5 699.08 711.17 -344.54 689.08 268.46 3 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Model diagnostics
# diagnostics
# qq plot
qqnorm(resid(m1), main = "")
qqline(resid(m1)) # good, one possible outlier

# residual plot
plot(fitted(m1), resid(m1), xlab = "fitted", ylab = "residuals")
abline(0,0)

# possibly one or two outliers
# QQPlot for group-level effects
qqnorm(ranef(m1)$accessNum[[1]], main="Normal Q-Q plot for random effects")
qqline(ranef(m1)$accessNum[[1]]) # looks good

infl <- influence(m1, obs = TRUE) # takes a while to calculate
plot(infl, which = 'cook') # nothing above 1

# visualize model:
sjp.lmer(m1, type = 'fe')

sjp.lmer(m1, type = 're', sort = TRUE) # plot random effects to look for outliers
## Plotting random effects...

#check assumptions of model
overdisp_fun <- function(model) {
## number of variance parameters in
## an n-by-n variance-covariance matrix
vpars <- function(m) {
nrow(m)*(nrow(m)+1)/2
}
model.df <- sum(sapply(VarCorr(model),vpars))+length(fixef(model))
rdf <- nrow(model.frame(model))-model.df
rp <- residuals(model,type="pearson")
Pearson.chisq <- sum(rp^2)
prat <- Pearson.chisq/rdf
pval <- pchisq(Pearson.chisq, df=rdf, lower.tail=FALSE)
c(chisq=Pearson.chisq,ratio=prat,rdf=rdf,p=pval)
}
overdisp_fun(m1)
## chisq ratio rdf p
## 2.613667e+02 3.350855e+00 7.800000e+01 1.230697e-21
# here's another way to check for overdispersion
residDev <- sum(residuals(m1, type = 'deviance')^2) # calculate residual deviance
# this ratio should be about 1 -- larger than 1 suggests overdispersion
residDev / df.residual(m1)
## [1] 3.918246
Use negative binomial model, since the poisson model is overdispersed
m1.1 <- glmer.nb(Freq ~ trt + (1|accessNum), data = cf1)
summary(m1.1)
## Generalized linear mixed model fit by maximum likelihood (Laplace
## Approximation) [glmerMod]
## Family: Negative Binomial(2.3758) ( log )
## Formula: Freq ~ trt + (1 | accessNum)
## Data: cf1
##
## AIC BIC logLik deviance df.resid
## 568.2 582.7 -278.1 556.2 77
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -1.44802 -0.59594 -0.05952 0.47425 2.46419
##
## Random effects:
## Groups Name Variance Std.Dev.
## accessNum (Intercept) 0.4418 0.6647
## Number of obs: 83, groups: accessNum, 22
##
## Fixed effects:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 2.23813 0.21591 10.366 < 2e-16 ***
## trtBagged -1.01254 0.24831 -4.078 4.55e-05 ***
## trtBagged & Selfed 0.01286 0.23231 0.055 0.95585
## trtUnbagged & Outcrossed 0.76456 0.22895 3.339 0.00084 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr) trtBgg trtB&S
## trtBagged -0.487
## trtBggd&Slf -0.519 0.469
## trtUnbggd&O -0.539 0.447 0.465
# get overall p-value for treatment
m2.1 <- update(m1.1, .~. - trt)
anova(m1.1, m2.1, test = 'LRT')
## Data: cf1
## Models:
## m2.1: Freq ~ (1 | accessNum)
## m1.1: Freq ~ trt + (1 | accessNum)
## Df AIC BIC logLik deviance Chisq Chi Df Pr(>Chisq)
## m2.1 3 613.21 620.47 -303.61 607.21
## m1.1 6 568.15 582.67 -278.08 556.15 51.058 3 4.756e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Negative binomial model diagnostics
overdisp_fun(m1.1)
## chisq ratio rdf p
## 57.3160985 0.7348218 78.0000000 0.9621017
# here's another way to check for overdispersion
residDev <- sum(residuals(m1.1, type = 'deviance')^2) # calculate residual deviance
# this ratio should be about 1 -- larger than 1 suggests overdispersion
residDev / df.residual(m1.1)
## [1] 1.103789
# diagnostics
# qq plot
qqnorm(resid(m1.1), main = "")
qqline(resid(m1.1)) # good, one possible outlier

# residual plot
plot(fitted(m1.1), resid(m1.1), xlab = "fitted", ylab = "residuals")
abline(0,0) # residuals look better for this model

# QQPlot for group-level effects
qqnorm(ranef(m1.1)$accessNum[[1]], main="Normal Q-Q plot for random effects")
qqline(ranef(m1.1)$accessNum[[1]]) # looks good

infl <- influence(m1.1, obs = TRUE) # takes a while to calculate
plot(infl, which = 'cook') # nothing above 1

# visualize model:
sjp.lmer(m1.1, type = 'fe')

sjp.lmer(m1.1, type = 're', sort = TRUE) # plot random effects to look for outliers
## Plotting random effects...

# get estimated dispersion parameter for NB Model
getME(m1.1, "glmer.nb.theta") # 2.37
## [1] 2.375765
# post-hoc pairwise comparisons with adjusted p-values
# from documentation:
# test = adjusted() multiple test procedures as specified by the type argument
# to adjusted: "single-step" denotes adjusted p values as computed
# from the joint normal or t distribution of the z statistics (default),
summary(glht(m1.1, lsm(pairwise ~ trt)), test=adjusted("single-step")) # pairwise tests for fruit counts
##
## Simultaneous Tests for General Linear Hypotheses
##
## Fit: glmer(formula = Freq ~ trt + (1 | accessNum), data = cf1, family = negative.binomial(theta = 2.37576455660569))
##
## Linear Hypotheses:
## Estimate Std. Error z value
## Control - Bagged == 0 1.01254 0.24831 4.078
## Control - Bagged & Selfed == 0 -0.01286 0.23231 -0.055
## Control - Unbagged & Outcrossed == 0 -0.76456 0.22895 -3.339
## Bagged - Bagged & Selfed == 0 -1.02541 0.24792 -4.136
## Bagged - Unbagged & Outcrossed == 0 -1.77711 0.25143 -7.068
## Bagged & Selfed - Unbagged & Outcrossed == 0 -0.75170 0.23857 -3.151
## Pr(>|z|)
## Control - Bagged == 0 < 0.001 ***
## Control - Bagged & Selfed == 0 0.99994
## Control - Unbagged & Outcrossed == 0 0.00466 **
## Bagged - Bagged & Selfed == 0 < 0.001 ***
## Bagged - Unbagged & Outcrossed == 0 < 0.001 ***
## Bagged & Selfed - Unbagged & Outcrossed == 0 0.00880 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## (Adjusted p values reported -- single-step method)
Visualize annotated plot for fruit counts
count_pub <- within(countFin, {
trt <- mapvalues(trt, from = c("Control","Control_2", "Bagged", "Bagged & Selfed",
"Unbagged & Outcrossed"),
to = c("Control","Control_2", "Bagged", "Bagged\n & Selfed",
"Outcrossed"))
})
count_pub <- droplevels(count_pub)
ggplot(count_pub[!(count_pub$trt %in% 'Control_2'), ], aes(x = trt , y = Freq+ 0.1)) +
geom_boxplot(width = 0.2, fill = 'grey80') +
labs(y = "Number of fruits", x = "Treatment") +
theme(axis.text.x = element_text(angle = 45, hjust = 1),
legend.position = 'none') +
# annotate
annotate(geom="text", x=c(1,2,3,4), y=c(70, 70, 70, 70) + 10, label=c("a", "b", "a", "c"),
color="black") +
scale_y_log10(breaks = c(1, 10, 100), limits = c(0.1, 100))

ggsave(paste0(saveDir, "KalmiaFruitNumber_logScale.pdf"), width = 3, height = 4)
Visualize CI’s for each of the groups
# refit m1.1 with treatments in alphabetical order (b/c order isn't preserved in model matrix)
cf1.1 <- within(cf1, {
trt = as.factor(as.character(cf1$trt))
})
# refit model with different reference levels
m1.1 <- glmer.nb(Freq ~ trt + (1|accessNum), data = cf1.1)
pframe <- data.frame(trt = levels(droplevels(cf1.1$trt)))
pframe$Freq <- 0
pp <- predict(m1.1, newdata = pframe, re.form=NA, type = 'response') # re.form sets all random effects to 0
### Calculate CI's (using bootstrap, not accounting for random effects)
mm <- model.matrix(terms(m1.1), pframe)
predFun<-function(.) exp(mm%*%fixef(.) )
bb<-bootMer(m1.1,FUN=predFun,nsim=200) #do this 200 times
## Warning in checkConv(attr(opt, "derivs"), opt$par, ctrl = control
## $checkConv, : Model failed to converge with max|grad| = 0.00625995 (tol =
## 0.001, component 1)
## Warning in checkConv(attr(opt, "derivs"), opt$par, ctrl = control
## $checkConv, : Model failed to converge with max|grad| = 0.0073433 (tol =
## 0.001, component 1)
#as we did this 200 times the 95% CI will be bordered by the 5th and 195th value
bb_se<-apply(bb$t,2,function(x) quantile(x, probs = c(0.025, 0.975)))
pframe$blo<-bb_se[1,]
pframe$bhi<-bb_se[2,]
pframe$predMean <- pp
pframe$median = tapply(cf1.1$Freq, INDEX = cf1.1$trt, median)
pframe$trt <- mapvalues(pframe$trt, from = c("Control", "Bagged", "Bagged & Selfed",
"Unbagged & Outcrossed"),
to = c("Control", "Bagged", "Bagged\n & Selfed",
"Outcrossed"))
pframe$trt <- relevel(pframe$trt, ref = "Control")
#plot 95% confidence intervals
# "Mean and bootstrap CI based on fixed-effects uncertainty ONLY"
number_ticks <- function(n) {function(limits) pretty(limits, n)}
g0 <- ggplot(pframe, aes(x=trt, y=predMean))+
geom_point()+
labs(y = "Number of fruits", x = "Treatment") +
geom_errorbar(aes(ymin = blo, ymax = bhi), width = 0.1)+
scale_y_log10(limits =c(1,60), breaks = number_ticks(6)) +
theme(axis.text.x = element_text(angle = 45, hjust = 1),
legend.position = 'none') +
# annotate
annotate(geom="text", x=c(1,2,3,4), y=c(70, 70, 70, 70) - 10, label=c("a", "b", "a", "c"),
color="black")
g0

ggsave(paste0(saveDir, "KalmiaFruitNumber_BSCI_logScale.pdf"), width = 3, height = 4)
## Bootstrap CI on linear scale -- not that great!
g1 <- ggplot(pframe, aes(x=trt, y=predMean))+
geom_point()+
labs(y = "Number of Fruits", x = "Treatment") +
geom_errorbar(aes(ymin = blo, ymax = bhi), width = 0.1)+
theme(axis.text.x = element_text(angle = 45, hjust = 1),
legend.position = 'none') +
# annotate
annotate(geom="text", x=c(1,2,3,4), y=c(70, 70, 70, 70) - 30, label=c("a", "b", "a", "c"),
color="black")
g1

ggsave(paste0(saveDir, "KalmiaFruitNumber_BSCI_LinearScale.pdf"), width = 3, height = 4)
cp <- droplevels(count_pub[count_pub$trt != 'Control_2',])
# this might actually be the best way
ggplot(cp, aes(x = trt , y = Freq)) +
geom_boxplot(width = 0.2, fill = 'grey80') +
labs(y = "Number of fruits", x = "Treatment") +
theme(axis.text.x = element_text(angle = 45, hjust = 1),
legend.position = 'none') +
# annotate
annotate(geom="text", x=c(1,2,3,4), y=c(70, 70, 70, 70) + 1, label=c("a", "b", "a", "c"),
color="black")

ggsave(paste0(saveDir, "KalmiaFruitNumber_LinearScale.pdf"), width = 3, height = 4)
Visualize fruit size
# calculate fruit size by plant
sizeDF_mean <- as.data.frame(tapply(kfrt$dia_mm, INDEX = kfrt$plantNum, mean))
colnames(sizeDF_mean) = "meanFrtSz"
sizeDF_mean$trt = sapply(X = 1:length(sizeDF_mean$meanFrtSz),
FUN = function(x) strsplit(as.character(row.names(sizeDF_mean)[x]),
split = "__")[[1]][2])
sizeDF_mean$trt <- mapvalues(sizeDF_mean$trt, c(1,2,3,4,5), c("Bagged", "Bagged & Selfed", "Control",
"Unbagged & Outcrossed", "Control_2"))
# reorder
sizeDF_mean$trt <- factor(sizeDF_mean$trt, levels = c("Control", "Control_2", "Bagged", "Bagged & Selfed",
"Unbagged & Outcrossed"))
sizeDF_mean$accessNum = sapply(X = 1:length(sizeDF_mean$meanFrtSz),
FUN = function(x) strsplit(as.character(row.names(sizeDF_mean)[x]),
split = "__")[[1]][1])
ggplot(sizeDF_mean, aes(x = trt, y = meanFrtSz, fill = trt)) +
geom_boxplot(alpha = 0.5) +
# stat_summary(fun.y=mean, geom="line", aes(group = accessNum, color = accessNum)) +
# stat_summary(fun.y=mean, geom="point", aes(group = accessNum, color = accessNum)) +
geom_point(aes(color = trt))

ggplot(sizeDF_mean, aes(x = trt, y = meanFrtSz, fill = trt)) +
geom_boxplot() +
labs(y = "Mean Fruit Diameter (mm)", x = "Treatment") +
scale_fill_brewer(name = "Treatment", palette = "Set1")

ggsave(paste0(saveDir, "KalmiaFruitDiameter.pdf"), width = 10, height = 8)
# visualize , but exclude treatment 5
ggplot(sizeDF_mean[!(sizeDF_mean$trt %in% 'Control_2'), ],
aes(x = trt, y = meanFrtSz, fill = trt)) +
geom_boxplot() +
labs(y = "Mean Fruit Diameter (mm)", x = "Treatment") +
scale_fill_brewer(name = "Treatment", palette = "Set1") +
theme(axis.text.x = element_text(angle = 35, hjust = 1),
legend.position = 'none')

ggsave(paste0(saveDir, "KalmiaFruitDiameter_trt14Only.pdf"), width = 5, height = 4)
Model Fruit Size with LMER
size1 <- within(kfrt, {
trt <- mapvalues(trt, c("Bagged", "Bagged & Selfed", "Unbagged",
"Unbagged & Outcrossed", "Unbagged_2"),
c("Bagged", "Bagged\n & Selfed", "Control",
"Outcrossed", "Control_2"))
})
size1 <- droplevels(size1[size1$trt != "Control_2",])
# get sample sizes for size dataset
nrow(size1) # total number of fruits in analysis for size
## [1] 1035
data.frame(table(size1$accessNum)) # total number of fruits per plant
## Var1 Freq
## 1 1129_74_A 93
## 2 1129_74_B 36
## 3 1129_74_C 69
## 4 1129_74_E 43
## 5 1129_74_F 48
## 6 12_2006_A 22
## 7 128_96_MASS 64
## 8 132_98_MASS 17
## 9 150_58_A 39
## 10 232_46_A 13
## 11 35_40_C 70
## 12 39_60_A 37
## 13 425_74_D 94
## 14 440_66_A 53
## 15 46_18_A 19
## 16 572_64_MASS 31
## 17 624_64_B 43
## 18 673_69_B 124
## 19 677_66_MASS 69
## 20 685_70_A 23
## 21 721_79_B 10
## 22 UNLABELED_1 18
size1$trt <- relevel(as.factor(size1$trt), ref = "Control")
f1 <- lmer(dia_mm ~ trt + (1|accessNum), data = size1)
summary(f1) # final model for paper
## Linear mixed model fit by REML ['lmerMod']
## Formula: dia_mm ~ trt + (1 | accessNum)
## Data: size1
##
## REML criterion at convergence: 1859.4
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -3.9129 -0.6393 0.0309 0.6617 3.4872
##
## Random effects:
## Groups Name Variance Std.Dev.
## accessNum (Intercept) 0.1947 0.4412
## Residual 0.3264 0.5713
## Number of obs: 1035, groups: accessNum, 22
##
## Fixed effects:
## Estimate Std. Error t value
## (Intercept) 4.35455 0.10232 42.56
## trtBagged -0.51606 0.07280 -7.09
## trtBagged\n & Selfed 0.01786 0.05313 0.34
## trtOutcrossed 0.73112 0.04887 14.96
##
## Correlation of Fixed Effects:
## (Intr) trtBgg trtB&S
## trtBagged -0.193
## trtBggd&Slf -0.260 0.387
## trtOutcrssd -0.306 0.403 0.535
# get p-value for trt
f2 <- update(f1, .~. - trt)
anova(f1, f2, "LRT")
## refitting model(s) with ML (instead of REML)
## Data: size1
## Models:
## f2: dia_mm ~ (1 | accessNum)
## f1: dia_mm ~ trt + (1 | accessNum)
## Df AIC BIC logLik deviance Chisq Chi Df Pr(>Chisq)
## f2 3 2233.5 2248.3 -1113.74 2227.5
## f1 6 1856.3 1886.0 -922.16 1844.3 383.17 3 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Fruit Size Diagnostics
# diagnostics
# qq plot
qqnorm(resid(f1), main = "")
qqline(resid(f1)) # good

# residual plot
plot(fitted(f1), residuals(f1, type = "deviance"), xlab = "fitted", ylab = "residuals")
abline(0,0) # residuals look better for this model

# QQPlot for group-level effects
qqnorm(ranef(f1)$accessNum[[1]], main="Normal Q-Q plot for random effects")
qqline(ranef(f1)$accessNum[[1]]) # looks good

infl <- influence(f1, obs = TRUE) # takes a while to calculate
plot(infl, which = 'cook') # nothing above 1

# visualize model:
sjp.lmer(f1, type = 'fe')
## Computing p-values via Kenward-Roger approximation. Use `p.kr = FALSE` if computation takes too long.

# Best Linear Unbiased Predictors (BLUPs)
sjp.lmer(f1, type = 're', sort = TRUE) # plot random effects to look for outliers
## Plotting random effects...

# post-hoc pairwise comparisons with adjusted p-values
# from documentation:
# test = adjusted() multiple test procedures as specified by the type argument
# to adjusted: "single-step" denotes adjusted p values as computed
# from the joint normal or t distribution of the z statistics (default),
summary(glht(f1, lsm(pairwise ~ trt)), test=adjusted("single-step")) # pairwise tests for fruit counts
## Loading required namespace: lmerTest
## Note: df set to 1018
##
## Simultaneous Tests for General Linear Hypotheses
##
## Fit: lmer(formula = dia_mm ~ trt + (1 | accessNum), data = size1)
##
## Linear Hypotheses:
## Estimate Std. Error t value Pr(>|t|)
## Control - Bagged == 0 0.51606 0.07280 7.089 <1e-06
## Control - Bagged\n & Selfed == 0 -0.01786 0.05313 -0.336 0.987
## Control - Outcrossed == 0 -0.73112 0.04887 -14.960 <1e-06
## Bagged - Bagged\n & Selfed == 0 -0.53392 0.07163 -7.454 <1e-06
## Bagged - Outcrossed == 0 -1.24718 0.06945 -17.958 <1e-06
## Bagged\n & Selfed - Outcrossed == 0 -0.71326 0.04934 -14.456 <1e-06
##
## Control - Bagged == 0 ***
## Control - Bagged\n & Selfed == 0
## Control - Outcrossed == 0 ***
## Bagged - Bagged\n & Selfed == 0 ***
## Bagged - Outcrossed == 0 ***
## Bagged\n & Selfed - Outcrossed == 0 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## (Adjusted p values reported -- single-step method)
Visualize Fruit Size with annotations
# refit m1.1 with treatments in alphabetical order (b/c order isn't preserved in model matrix)
sf1<- within(size1, {
trt = as.factor(as.character(size1$trt))
})
# refit model with different reference levels
f1 <- lmer(dia_mm ~ trt + (1|accessNum), data = sf1)
pframe <- data.frame(trt = levels(droplevels(sf1$trt)))
pframe$dia_mm = 0
pp <- predict(f1, newdata = pframe, re.form=NA, type = 'response') # re.form sets all random effects to 0
### Calculate CI's (using bootstrap, not accounting for random effects)
mm <- model.matrix(terms(f1), pframe)
predFun<-function(.) mm%*%fixef(.)
bb<-bootMer(f1,FUN=predFun,nsim=200) #do this 200 times
# get quantiles from bootstrap sample
bb_se<-apply(bb$t,2,function(x) quantile(x, probs = c(0.025, 0.975)))
pframe$blo<-bb_se[1,]
pframe$bhi<-bb_se[2,]
pframe$predMean <- pp
pframe # print frame, in case reporting in a table
## trt dia_mm blo bhi predMean
## 1 Bagged 0 3.616601 4.030417 3.838494
## 2 Bagged\n & Selfed 0 4.167830 4.541833 4.372409
## 3 Control 0 4.157682 4.552634 4.354550
## 4 Outcrossed 0 4.901570 5.270815 5.085670
pframe$trt <- relevel(pframe$trt, ref = "Control")
#plot 95% confidence intervals
# "Mean and bootstrap CI based on fixed-effects uncertainty ONLY"
g2 <- ggplot(pframe, aes(x=trt, y=predMean))+
geom_point()+
labs(y = "Fruit dia. (mm)", x = "Treatment") +
geom_errorbar(aes(ymin = blo, ymax = bhi), width = 0.1)+
theme(axis.text.x = element_text(angle = 45, hjust = 1),
legend.position = 'none') +
# annotate
annotate(geom="text", x=c(1,2,3,4), y=c(5,5,5,5) + 0.7, label=c("a", "b", "a", "c"),
color="black")
g2

ggsave(paste0(saveDir, "KalmiaFruitDia_BSCI.pdf"), width = 3, height = 4)
## visualize raw data (mean fruit size per plant)
sdf1 <- droplevels(within(sizeDF_mean[!(sizeDF_mean$trt %in% 'Control_2'), ], {
trt <- mapvalues(trt, c("Bagged", "Bagged & Selfed", "Control",
"Unbagged & Outcrossed", "Control_2"),
c("Bagged", "Bagged\n & Selfed", "Control",
"Outcrossed", "Control_2"))
}))
# visualize fruit size
ggplot(sdf1, aes(x = trt, y = meanFrtSz)) +
geom_boxplot(width = 0.2, fill = 'grey80') +
labs(y = "Fruit dia. (mm)", x = "Treatment") +
theme(axis.text.x = element_text(angle = 45, hjust = 1),
legend.position = 'none') +
# annotate
annotate(geom="text", x=c(1,2,3,4), y=c(5,5,5,5) + 2, label=c("a", "b", "a", "c"),
color="black")

ggsave(paste0(saveDir, "KalmiaFruitDia.pdf"), width = 3, height = 4)
# compare two plots
g2 + geom_boxplot(data = sdf1, aes(x = trt, y = meanFrtSz), width = 0.2, alpha = 0)

Visualize data to compare fruit size with number of seeds
# read in data
sizeSeed <- read.csv("KalmiaFruitSizeAndSeeds.csv")
ggplot(sizeSeed, aes(x = Dia..mm., y = NumSeeds)) +
geom_point() +
stat_smooth(method = 'lm', formula = y ~ exp(x), se = F) +
labs(x = 'Fruit Diameter (mm)', y = 'Num Seeds in 1 Carpel')

ggsave(paste0(saveDir, "KalmiaFruitSeeds.pdf"), width = 5, height = 4)
# on log scale
ggplot(sizeSeed, aes(x = Dia..mm., y = NumSeeds)) +
geom_point() +
stat_smooth(method = 'lm', formula = y ~ x, se = F) +
scale_y_continuous(trans="log") +
labs(y = "log(e) number of seeds")

# GLM
ss1 <- glm(NumSeeds ~ Dia..mm., family = poisson, data = sizeSeed)
summary(ss1)
##
## Call:
## glm(formula = NumSeeds ~ Dia..mm., family = poisson, data = sizeSeed)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -4.1484 -1.3123 0.0538 1.1869 3.4399
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.22136 0.32470 0.682 0.495
## Dia..mm. 0.63356 0.07019 9.027 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for poisson family taken to be 1)
##
## Null deviance: 156.083 on 18 degrees of freedom
## Residual deviance: 68.423 on 17 degrees of freedom
## AIC: 160.53
##
## Number of Fisher Scoring iterations: 4
ss1$deviance / ss1$df.residual # overdispersed
## [1] 4.024875
# Neg Binomial Model
ss2 <- glm.nb(NumSeeds ~ Dia..mm., data = sizeSeed)
summary(ss2) # final model for paper
##
## Call:
## glm.nb(formula = NumSeeds ~ Dia..mm., data = sizeSeed, init.theta = 7.72630546,
## link = log)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.03651 -0.83952 0.02891 0.59012 1.60618
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.2739 0.5660 0.484 0.628
## Dia..mm. 0.6215 0.1292 4.811 1.5e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for Negative Binomial(7.7263) family taken to be 1)
##
## Null deviance: 43.384 on 18 degrees of freedom
## Residual deviance: 18.964 on 17 degrees of freedom
## AIC: 135.72
##
## Number of Fisher Scoring iterations: 1
##
##
## Theta: 7.73
## Std. Err.: 3.54
##
## 2 x log-likelihood: -129.72
# get overall test stat for dia
ss3 <-update(ss2, .~. - Dia..mm.)
anova(ss2, ss3) # overall p-value for Dia..mm.
## Likelihood ratio tests of Negative Binomial Models
##
## Response: NumSeeds
## Model theta Resid. df 2 x log-lik. Test df LR stat.
## 1 1 2.828473 18 -145.5067
## 2 Dia..mm. 7.726305 17 -129.7204 1 vs 2 1 15.78629
## Pr(Chi)
## 1
## 2 7.091441e-05
## visualize CI's for the two different models
newD <- data.frame(Dia..mm. = seq(2, 6, length.out = 100))
nbPreds = predict(ss2, newD, type = 'response', se.fit = TRUE)
nd1 <- data.frame(newD, nppred = nbPreds$fit, npse = nbPreds$se.fit)
predsPois <- predict(ss1, newD, type = 'response', se.fit = TRUE)
n2 <- data.frame(nd1, poisPred = predsPois$fit, poisse = predsPois$se.fit)
n2$npUpper <- with(n2, nppred + 2 * npse)
n2$npLower <- with(n2, nppred - 2 * npse)
n2$pUpper <- with(n2, poisPred + 2 * poisse)
n2$pLower <- with(n2, poisPred - 2 * poisse)
# compare NB errors vs. Pois errors
ggplot(sizeSeed, aes(x = Dia..mm., y = NumSeeds)) +
geom_point() +
geom_line(data = n2, aes(x = Dia..mm., y = poisPred), color = 'blue') +
geom_line(data = n2, aes(x = Dia..mm., y = pUpper), color = 'blue', linetype = 2) +
geom_line(data = n2, aes(x = Dia..mm., y = pLower), color = 'blue', linetype = 2) +
labs(x = 'Fruit Diameter (mm)', y = 'Num Seeds in 1 Carpel') +
geom_line(data = n2, aes(x = Dia..mm., y = nppred), color = 'red') +
geom_line(data = n2, aes(x = Dia..mm., y = npUpper), color = 'red', linetype = 3) +
geom_line(data = n2, aes(x = Dia..mm., y = npLower), color = 'red', linetype = 3)

## model diagnostics for negative binomial model
plot(ss2, which = 4) #cook's distance

plot(y = residuals(ss2, type = 'deviance'), x = ss2$fitted.values)
abline(h = 0)

# should be close to 1
ss2$deviance / ss2$df.residual
## [1] 1.115512
# get sample size
nrow(sizeSeed) # total number of individual fruits
## [1] 19
sum(sizeSeed$NumSeeds) # total number of seeds counted
## [1] 382
Print sesion info
sessionInfo()
## R version 3.3.2 (2016-10-31)
## Platform: x86_64-apple-darwin13.4.0 (64-bit)
## Running under: OS X El Capitan 10.11.6
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] lsmeans_2.25 estimability_1.2 multcomp_1.4-6
## [4] TH.data_1.0-8 MASS_7.3-45 survival_2.40-1
## [7] mvtnorm_1.0-5 sjPlot_2.2.1 influence.ME_0.9-8
## [10] plyr_1.8.4 lme4_1.1-12 Matrix_1.2-8
## [13] ggplot2_2.2.1
##
## loaded via a namespace (and not attached):
## [1] Rcpp_0.12.9 stringdist_0.9.4.4 lattice_0.20-34
## [4] tidyr_0.6.1 zoo_1.7-14 lmtest_0.9-34
## [7] assertthat_0.1 rprojroot_1.2 digest_0.6.12
## [10] psych_1.6.12 mime_0.5 R6_2.2.0
## [13] backports_1.0.5 stats4_3.3.2 evaluate_0.10
## [16] coda_0.19-1 lazyeval_0.2.0 minqa_1.2.4
## [19] nloptr_1.0.4 DT_0.2 rmarkdown_1.3
## [22] labeling_0.3 splines_3.3.2 stringr_1.1.0
## [25] foreign_0.8-67 htmlwidgets_0.8 munsell_0.4.3
## [28] shiny_1.0.0 broom_0.4.1 httpuv_1.3.3
## [31] modelr_0.1.0 mnormt_1.5-5 htmltools_0.3.5
## [34] nnet_7.3-12 tibble_1.2 coin_1.1-3
## [37] codetools_0.2-15 dplyr_0.5.0 sjmisc_2.2.1
## [40] grid_3.3.2 nlme_3.1-130 arm_1.9-3
## [43] xtable_1.8-2 gtable_0.2.0 DBI_0.5-1
## [46] magrittr_1.5 scales_0.4.1 stringi_1.1.2
## [49] reshape2_1.4.2 effects_3.1-2 sandwich_2.3-4
## [52] blme_1.0-4 RColorBrewer_1.1-2 tools_3.3.2
## [55] purrr_0.2.2 sjstats_0.7.1 abind_1.4-5
## [58] parallel_3.3.2 yaml_2.1.14 colorspace_1.3-2
## [61] knitr_1.15.1 haven_1.0.0 merTools_0.3.0
## [64] modeltools_0.2-21
# print time of that html document was created
Sys.time()
## [1] "2017-02-11 17:27:49 EST"